import jax.numpy as jnp
import jax
from jax.experimental.host_callback import id_print
import matplotlib.pyplot as plt
from jax.lax import scan
from jax import grad, jit, vmap
import jax.random as random
from functools import partial
rng = random.PRNGKey(2022)

import scipy

import seaborn as sns
sns.set_style("darkgrid")
cm = sns.color_palette("mako_r", as_cmap=True)


def sample_sphere(J):
    """
    2 dimensional sample
    
    N_samples: Number of samples
    Returns a (N_samples, 2) array of samples
    """
    alphas = jnp.linspace(0, 2*jnp.pi * (1 - 1/J), J)
    xs = jnp.cos(alphas)
    ys = jnp.sin(alphas)
    mf = jnp.stack([xs, ys], axis=1)
    return mf


J = 8
mf = sample_sphere(J)
plt.scatter(mf[:, 0], mf[:, 1])

<matplotlib.collections.PathCollection at 0x7fba60633ca0>


beta_min = 0.001
beta_max = 3

def beta_t(t):
    """
    t: time (number)
    returns beta_t as explained above
    """
    return beta_min + t*(beta_max - beta_min)

def alpha_t(t):
    """
    t: time (number)
    returns alpha_t as explained above
    """
    return t*beta_min + 0.5 * t**2 * (beta_max - beta_min)

def drift(x, t):
    """
    x: location of J particles in N dimensions, shape (J, N)
    t: time (number)
    returns the drift of a time-changed OU-process for each batch member, shape (J, N)
    """
    return -0.5*beta_t(t)*x

def dispersion(t):
    """
    t: time (number)
    returns the dispersion
    """
    return jnp.sqrt(beta_t(t))

def mean_factor(t):
    """
    t: time (number)
    returns m_t as above
    """
    return jnp.exp(-0.5 * alpha_t(t))

def var(t):
    """
    t: time (number)
    returns v_t as above
    """
    return 1 - jnp.exp(-alpha_t(t))


from jax.scipy.special import logsumexp

def log_hat_pt(x, t):
    """
    x: One location in R^n
    t: time (number)
    returns the log density log \hat{p}_t(x) as described above
    
    """
    N = mf.shape[0]
    means = mf * mean_factor(t)
    v = var(t)
    potentials = jnp.sum(-(x - means)**2 / (2 * v), axis=1)
    # this is equivalent to
    # return jnp.log(1/N * jnp.sum(jnp.exp(potentials)))
    # but is numerically more stable
    return logsumexp(potentials, axis=0, b=1/N)

nabla_log_hat_pt = jit(vmap(grad(log_hat_pt), in_axes=(0, 0), out_axes=(0)))


def plot_score(score, t, area_min=-1, area_max=1):
    #this helper function is here so that we can jit it.
    #We can not jit the whole function since plt.quiver cannot
    #be jitted
    @partial(jit, static_argnums=[0,])
    def helper(score, t, area_min, area_max):
        x = jnp.linspace(area_min, area_max, 16)
        x, y = jnp.meshgrid(x, x)
        grid = jnp.stack([x.flatten(), y.flatten()], axis=1)
        t = jnp.ones((grid.shape[0], 1)) * t
        scores = score(grid, t)
        return grid, scores
    grid, scores = helper(score, t, area_min, area_max)
    plt.quiver(grid[:, 0], grid[:, 1], scores[:, 0], scores[:, 1])

plot_score(nabla_log_hat_pt, 0.001, -2, 2)


R = 1000
train_ts = jnp.arange(1, R)/(R-1)

#we jit the function, but we have to mark some of the arguments as static,
#which means the function is recompiled every time these arguments are changed,
#since they are directly compiled into the binary code. This is necessary
#since jitted-functions cannot have functions as arguments. But it also 
#no problem since these arguments will never/rarely change in our case,
#therefore not triggering re-compilation.
@partial(jit, static_argnums=[1,2,3,4,5])
def reverse_sde(rng, N, n_samples, forward_drift, dispersion, score, ts=train_ts):
    """
    rng: random number generator (JAX rng)
    N: dimension in which the reverse SDE runs
    N_initial: How many samples from the initial distribution N(0, I), number
    forward_drift: drift function of the forward SDE (we implemented it above)
    disperion: dispersion function of the forward SDE (we implemented it above)
    score: The score function to use as additional drift in the reverse SDE
    ts: a discretization {t_i} of [0, T], shape 1d-array
    """
    def f(carry, params):
        t, dt = params
        x, rng = carry
        rng, step_rng = jax.random.split(rng)
        disp = dispersion(1-t)
        t = jnp.ones((x.shape[0], 1)) * t
        drift = -forward_drift(x, 1-t) + disp**2 * score(x, 1-t)
        noise = random.normal(step_rng, x.shape)

        x = x + dt * drift + jnp.sqrt(dt)*disp*noise
        return (x, rng), ()
    
    rng, step_rng = random.split(rng)
    initial = random.normal(step_rng, (n_samples, N))
    dts = ts[1:] - ts[:-1]
    params = jnp.stack([ts[:-1], dts], axis=1)
    #scan is jax version of a for-loop (can be compiled to high-performance code cheaply)
    (x, _), _ = scan(f, (initial, rng), params)
    return x


def plot_heatmap(positions, area_min=-2, area_max=2):
    """
    positions: locations of all particles in R^2, array (J, 2)
    area_min: lowest x and y coordinate
    area_max: highest x and y coordinate
    
    will plot a heatmap of all particles in the area [area_min, area_max] x [area_min, area_max]
    """
    def small_kernel(z, area_min, area_max):
        a = jnp.linspace(area_min, area_max, 512)
        x, y = jnp.meshgrid(a, a)
        dist = (x - z[0])**2 + (y - z[1])**2
        hm = jnp.exp(-350*dist)
        return hm

    #again we try to jit most of the code, but use the helper functions
    #since we cannot jit all of it because of the plt functions
    @jit
    def produce_heatmap(positions, area_min, area_max):
        return jnp.sum(vmap(small_kernel, in_axes=(0, None, None))(positions, area_min, area_max), axis=0)

    hm = produce_heatmap(positions, area_min, area_max) #np.sum(vmap(small_kernel)(to_plot), axis=0)
    extent = [area_min, area_max, area_max, area_min]
    plt.imshow(hm, cmap=cm, interpolation='nearest', extent=extent)
    ax = plt.gca()
    ax.invert_yaxis()


rng, step_rng = random.split(rng)
samples = reverse_sde(step_rng, 2, 5000, drift, dispersion, nabla_log_hat_pt)
plot_heatmap(samples[:, [0,1]], -3, 3)


perturbed_score = lambda x, t: nabla_log_hat_pt(x, t) + 1
rng, step_rng = random.split(rng)
samples = reverse_sde(step_rng, 2, 5000, drift, dispersion, perturbed_score)
plot_heatmap(samples[:, [0,1]], -3, 3)


import flax.linen as nn

class ApproximateScore(nn.Module):
    """A simple model with multiple fully connected layers and some fourier features for the time variable."""

    @nn.compact
    def __call__(self, x, t):
        in_size = x.shape[1]
        n_hidden = 256
        act = nn.relu
        # t = jnp.concatenate([t - 0.5, jnp.cos(2*jnp.pi*t)],axis=1)
        t = jnp.concatenate([t - 0.5, jnp.cos(2*jnp.pi*t), jnp.sin(2*jnp.pi*t), -jnp.cos(4*jnp.pi*t)],axis=1)
        x = jnp.concatenate([x, t],axis=1)
        x = nn.Dense(n_hidden)(x)
        x = nn.relu(x)
        x = nn.Dense(n_hidden)(x)
        x = nn.relu(x)
        x = nn.Dense(n_hidden)(x)
        x = nn.relu(x)
        x = nn.Dense(in_size)(x)
        return x


import optax

batch_size = 16
#some dummy input data. Flax is able to infer all the dimensions of the weights
#if we supply if with the kind of input data it has to expect
x = jnp.zeros(2*batch_size).reshape((batch_size, 2))
time = jnp.ones((batch_size, 1))
#initialize the model weights
score_model = ApproximateScore()
params = score_model.init(rng, x, time)

#Initialize the optimizer
optimizer = optax.adam(1e-3)
opt_state = optimizer.init(params)


def loss_fn(params, model, rng, batch):
    """
    params: the current weights of the model
    model: the score function
    rng: random number generator from jax
    batch: a batch of samples from the training data, representing samples from \mu_text{data}, shape (J, N)
    
    returns an random (MC) approximation to the loss \bar{L} explained above
    """
    rng, step_rng = random.split(rng)
    N_batch = batch.shape[0]
    t = random.randint(step_rng, (N_batch,1), 1, R)/(R-1)
    mean_coeff = mean_factor(t)
    #is it right to have the square root here for the loss?
    vs = var(t)
    stds = jnp.sqrt(vs)
    rng, step_rng = random.split(rng)
    noise = random.normal(step_rng, batch.shape)
    xt = batch * mean_coeff + noise * stds
    output = score_model.apply(params, xt, t)
    loss = jnp.mean((noise + output*stds)**2)
    return loss

@partial(jit, static_argnums=[4])
def update_step(params, rng, batch, opt_state, model):
    """
    params: the current weights of the model
    rng: random number generator from jax
    batch: a batch of samples from the training data, representing samples from \mu_text{data}, shape (J, N)
    opt_state: the internal state of the optimizer
    model: the score function

    takes the gradient of the loss function and updates the model weights (params) using it. Returns
    the value of the loss function (for metrics), the new params and the new optimizer state
    """
    val, grads = jax.value_and_grad(loss_fn)(params, model, rng, batch)
    updates, opt_state = optimizer.update(grads, opt_state)
    params = optax.apply_updates(params, updates)
    return val, params, opt_state


N_epochs = 10_000
train_size = mf.shape[0]
batch_size = train_size
# batch_size = 5
steps_per_epoch = train_size // batch_size
losses = []
for k in range(N_epochs):
    rng, step_rng = random.split(rng)
    perms = jax.random.permutation(step_rng, train_size)
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    for perm in perms:
        batch = mf[perm, :]
        rng, step_rng = random.split(rng)
        loss, params, opt_state = update_step(params, step_rng, batch, opt_state, score_model)
        losses.append(loss)
    if (k+1) % 1000 == 0:
        mean_loss = jnp.mean(jnp.array(losses))
        print("Epoch %d \t, Loss %f " % (k+1, mean_loss))
        losses = []

Epoch 1000 	, Loss 0.543068 
Epoch 2000 	, Loss 0.493895 
Epoch 3000 	, Loss 0.475762 
Epoch 4000 	, Loss 0.474409 
Epoch 5000 	, Loss 0.450042 
Epoch 6000 	, Loss 0.447567 
Epoch 7000 	, Loss 0.447468 
Epoch 8000 	, Loss 0.442307 
Epoch 9000 	, Loss 0.428227 
Epoch 10000 	, Loss 0.431427


trained_score = lambda x, t: score_model.apply(params, x, t)
rng, step_rng = random.split(rng)
samples = reverse_sde(step_rng, 2, 1000, drift, dispersion, trained_score)
plot_heatmap(samples)


N_epochs = 30_000
train_size = mf.shape[0]
batch_size = train_size
# batch_size = 5
steps_per_epoch = train_size // batch_size
losses = []
for k in range(N_epochs):
    rng, step_rng = random.split(rng)
    perms = jax.random.permutation(step_rng, train_size)
    perms = perms[:steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    for perm in perms:
        batch = mf[perm, :]
        rng, step_rng = random.split(rng)
        loss, params, opt_state = update_step(params, step_rng, batch, opt_state, score_model)
        losses.append(loss)
    if (k+1) % 1000 == 0:
        mean_loss = jnp.mean(jnp.array(losses))
        print("Epoch %d \t, Loss %f " % (k+1, mean_loss))
        losses = []

Epoch 1000 	, Loss 0.381477 
Epoch 2000 	, Loss 0.378971 
Epoch 3000 	, Loss 0.359203 
Epoch 4000 	, Loss 0.361598 
Epoch 5000 	, Loss 0.373241 
Epoch 6000 	, Loss 0.363755 
Epoch 7000 	, Loss 0.370942 
Epoch 8000 	, Loss 0.380243 
Epoch 9000 	, Loss 0.367945 
Epoch 10000 	, Loss 0.370489 
Epoch 11000 	, Loss 0.362878 
Epoch 12000 	, Loss 0.358997 
Epoch 13000 	, Loss 0.366488 
Epoch 14000 	, Loss 0.362778 
Epoch 15000 	, Loss 0.349705 
Epoch 16000 	, Loss 0.372878 
Epoch 17000 	, Loss 0.354509 
Epoch 18000 	, Loss 0.368619 
Epoch 19000 	, Loss 0.366718 
Epoch 20000 	, Loss 0.363335 
Epoch 21000 	, Loss 0.369207 
Epoch 22000 	, Loss 0.367098 
Epoch 23000 	, Loss 0.371303 
Epoch 24000 	, Loss 0.367577 
Epoch 25000 	, Loss 0.355797 
Epoch 26000 	, Loss 0.344280 
Epoch 27000 	, Loss 0.358146 
Epoch 28000 	, Loss 0.354443 
Epoch 29000 	, Loss 0.361874 
Epoch 30000 	, Loss 0.358426


trained_score = lambda x, t: score_model.apply(params, x, t)
rng, step_rng = random.split(rng)
samples = reverse_sde(step_rng, 2, 1000, drift, dispersion, trained_score)
plot_heatmap(samples)


$m_t$	$\exp(-\frac{1}{2}\alpha_t)$
$v_t$	$1 - \exp(-\alpha_t)$	.

Score-based generative models¶

How do SGMs work¶

Forward SDE¶

Reverse SDE¶

The problems¶

Outline of the notebook¶

Generating Samples from $\mu_\text{data}$¶

The forward SDE¶

Ornstein-Uhlenbeck process (OU process)¶

Time-changed OU process¶

Marginals of the time-changed OU-process¶

Helper function: Plot score¶

Implementing the reverse SDE¶

Intuition¶

Implementation¶

Helper function: Plot marginals¶

Running the reverse SDE with the empirical drift¶

Bounded perturbation of the score¶

Neural Network Training¶

The loss¶

Remark: A short intermezzo on generalization/memorization in SGMs¶

Rewriting the loss to make it cheaper to evaluate¶

Hyperparameters for training and model initialization¶

Loss function and Update Step¶

NN Training¶

Plotting the results¶

Things to try out¶